{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Categorical Data in Pandas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hi Guys, Welcome to [Tirendaz Academy](https://youtube.com/c/tirendazacademy) 😀\n",
    "</br>\n",
    "In this notebook, I'm going to show categorical data in Pandas.\n",
    "</br>\n",
    "Happy Learning 🐱‍🏍 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## How is a variable translated into categorical structure?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     Tim\n",
       "1     Tom\n",
       "2     Sam\n",
       "3     Sam\n",
       "4     Tim\n",
       "5     Tom\n",
       "6     Sam\n",
       "7     Sam\n",
       "8     Tim\n",
       "9     Tom\n",
       "10    Sam\n",
       "11    Sam\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=pd.Series([\"Tim\",\"Tom\",\"Sam\",\"Sam\"]*3)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Tim', 'Tom', 'Sam'], dtype=object)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.unique(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sam    6\n",
       "Tom    3\n",
       "Tim    3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "values=pd.Series([0,1,0,0]*3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Tim\n",
       "1    Sam\n",
       "0    Tim\n",
       "0    Tim\n",
       "0    Tim\n",
       "1    Sam\n",
       "0    Tim\n",
       "0    Tim\n",
       "0    Tim\n",
       "1    Sam\n",
       "0    Tim\n",
       "0    Tim\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names=pd.Series([\"Tim\",\"Sam\"])\n",
    "names.take(values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Categorical Type in Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     Tim\n",
       "1     Tom\n",
       "2     Sam\n",
       "3     Sam\n",
       "4     Tim\n",
       "5     Tom\n",
       "6     Sam\n",
       "7     Sam\n",
       "8     Tim\n",
       "9     Tom\n",
       "10    Sam\n",
       "11    Sam\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "N=len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.DataFrame(\n",
    "    {\"name\":data,\n",
    "     \"num\":np.arange(N),\n",
    "     \"score\":np.random.randint(40,100,\n",
    "                               size=N),\n",
    "     \"weight\":np.random.uniform(50,70,\n",
    "                                size=N)},\n",
    "    columns=[\"num\",\"name\",\"score\",\"weight\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>num</th>\n",
       "      <th>name</th>\n",
       "      <th>score</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Tim</td>\n",
       "      <td>90</td>\n",
       "      <td>58.608318</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Tom</td>\n",
       "      <td>99</td>\n",
       "      <td>67.616725</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Sam</td>\n",
       "      <td>70</td>\n",
       "      <td>58.181046</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Sam</td>\n",
       "      <td>96</td>\n",
       "      <td>56.833079</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Tim</td>\n",
       "      <td>82</td>\n",
       "      <td>55.952711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>Tom</td>\n",
       "      <td>89</td>\n",
       "      <td>52.296487</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>Sam</td>\n",
       "      <td>97</td>\n",
       "      <td>53.203579</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7</td>\n",
       "      <td>Sam</td>\n",
       "      <td>96</td>\n",
       "      <td>63.967189</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>Tim</td>\n",
       "      <td>45</td>\n",
       "      <td>57.324508</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>Tom</td>\n",
       "      <td>57</td>\n",
       "      <td>58.393265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10</td>\n",
       "      <td>Sam</td>\n",
       "      <td>74</td>\n",
       "      <td>67.731591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>11</td>\n",
       "      <td>Sam</td>\n",
       "      <td>80</td>\n",
       "      <td>55.372677</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    num name  score     weight\n",
       "0     0  Tim     90  58.608318\n",
       "1     1  Tom     99  67.616725\n",
       "2     2  Sam     70  58.181046\n",
       "3     3  Sam     96  56.833079\n",
       "4     4  Tim     82  55.952711\n",
       "5     5  Tom     89  52.296487\n",
       "6     6  Sam     97  53.203579\n",
       "7     7  Sam     96  63.967189\n",
       "8     8  Tim     45  57.324508\n",
       "9     9  Tom     57  58.393265\n",
       "10   10  Sam     74  67.731591\n",
       "11   11  Sam     80  55.372677"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     Tim\n",
       "1     Tom\n",
       "2     Sam\n",
       "3     Sam\n",
       "4     Tim\n",
       "5     Tom\n",
       "6     Sam\n",
       "7     Sam\n",
       "8     Tim\n",
       "9     Tom\n",
       "10    Sam\n",
       "11    Sam\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"name\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df[\"name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     Tim\n",
       "1     Tom\n",
       "2     Sam\n",
       "3     Sam\n",
       "4     Tim\n",
       "5     Tom\n",
       "6     Sam\n",
       "7     Sam\n",
       "8     Tim\n",
       "9     Tom\n",
       "10    Sam\n",
       "11    Sam\n",
       "Name: name, dtype: category\n",
       "Categories (3, object): ['Sam', 'Tim', 'Tom']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "name_cat=df[\"name\"].astype(\"category\")\n",
    "name_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "x=name_cat.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Sam', 'Tim', 'Tom'], dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0], dtype=int8)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     Tim\n",
       "1     Tom\n",
       "2     Sam\n",
       "3     Sam\n",
       "4     Tim\n",
       "5     Tom\n",
       "6     Sam\n",
       "7     Sam\n",
       "8     Tim\n",
       "9     Tom\n",
       "10    Sam\n",
       "11    Sam\n",
       "Name: name, dtype: category\n",
       "Categories (3, object): ['Sam', 'Tim', 'Tom']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[\"name\"]=df[\"name\"].astype(\"category\")\n",
    "df.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'b', 'c', 'd', 'e']\n",
       "Categories (5, object): ['a', 'b', 'c', 'd', 'e']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_cat=pd.Categorical(list(\"abcde\"))\n",
    "data_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['banana', 'apple', 'kiwi', 'banana', 'apple']\n",
       "Categories (3, object): ['apple', 'banana', 'kiwi']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Categorical([\"banana\", \"apple\", \n",
    "                \"kiwi\", \"banana\", \"apple\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']\n",
       "Categories (4, object): ['baby', 'child', 'young', 'old']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "people=[\"baby\", \"child\", \"young\", \"old\"]\n",
    "codes=[0,1,2,3,1,0,0]\n",
    "people_cat=pd.Categorical.from_codes(\n",
    "    codes,people)\n",
    "people_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']\n",
       "Categories (4, object): ['baby' < 'child' < 'young' < 'old']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "people_cat=pd.Categorical.from_codes(\n",
    "    codes,people,ordered=True)\n",
    "people_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['baby', 'child', 'young', 'old', 'child', 'baby', 'baby']\n",
       "Categories (4, object): ['baby' < 'child' < 'young' < 'old']"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "people_cat.as_ordered()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with Categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data=np.random.randn(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(-2.9739999999999998, -0.668], (0.735, 3.402], (0.735, 3.402], (0.00973, 0.735], (-2.9739999999999998, -0.668], ..., (-0.668, 0.00973], (0.735, 3.402], (-0.668, 0.00973], (0.735, 3.402], (-2.9739999999999998, -0.668]]\n",
       "Length: 1000\n",
       "Categories (4, interval[float64]): [(-2.9739999999999998, -0.668] < (-0.668, 0.00973] < (0.00973, 0.735] < (0.735, 3.402]]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interval=pd.qcut(data,4)\n",
    "interval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.arrays.categorical.Categorical"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(interval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Q1', 'Q4', 'Q4', 'Q3', 'Q1', ..., 'Q2', 'Q4', 'Q2', 'Q4', 'Q1']\n",
       "Length: 1000\n",
       "Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interval=pd.qcut(data,4,labels=[\"Q1\",\"Q2\",\n",
    "                                \"Q3\",\"Q4\"])\n",
    "interval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "interval=pd.Series(interval,name=\"quarter\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>quarter</th>\n",
       "      <th>count</th>\n",
       "      <th>min</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q1</td>\n",
       "      <td>250</td>\n",
       "      <td>-2.973073</td>\n",
       "      <td>-0.669205</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Q2</td>\n",
       "      <td>250</td>\n",
       "      <td>-0.667556</td>\n",
       "      <td>0.009061</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Q3</td>\n",
       "      <td>250</td>\n",
       "      <td>0.010393</td>\n",
       "      <td>0.733632</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Q4</td>\n",
       "      <td>250</td>\n",
       "      <td>0.737847</td>\n",
       "      <td>3.402463</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  quarter  count       min       max\n",
       "0      Q1    250 -2.973073 -0.669205\n",
       "1      Q2    250 -0.667556  0.009061\n",
       "2      Q3    250  0.010393  0.733632\n",
       "3      Q4    250  0.737847  3.402463"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(\n",
    "    data).groupby(\n",
    "    interval).agg([\"count\",\n",
    "                   \"min\",\n",
    "                   \"max\"]).reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3- How is the performance of categorical types?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "N=10000000\n",
    "num=pd.Series(np.random.randn(N))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "label=pd.Series([\"a\",\"b\",\"c\",\"d\"]*(N//4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat=label.astype(\"category\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "80000128"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label.memory_usage()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000320"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat.memory_usage()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4- What are categorical methods?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "s=pd.Series([\"a\",\"b\",\"c\",\"d\"]*2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    d\n",
       "4    a\n",
       "5    b\n",
       "6    c\n",
       "7    d\n",
       "dtype: category\n",
       "Categories (4, object): ['a', 'b', 'c', 'd']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_ct=s.astype(\"category\")\n",
    "s_ct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    1\n",
       "2    2\n",
       "3    3\n",
       "4    0\n",
       "5    1\n",
       "6    2\n",
       "7    3\n",
       "dtype: int8"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_ct.cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['a', 'b', 'c', 'd'], dtype='object')"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_ct.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    d\n",
       "4    a\n",
       "5    b\n",
       "6    c\n",
       "7    d\n",
       "dtype: category\n",
       "Categories (5, object): ['a', 'b', 'c', 'd', 'e']"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_ct=[\"a\",\"b\",\"c\",\"d\",\"e\"]\n",
    "s_ct.cat.set_categories(new_ct)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "4    a\n",
       "5    b\n",
       "dtype: category\n",
       "Categories (4, object): ['a', 'b', 'c', 'd']"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_ct=s_ct[s_ct.isin([\"a\",\"b\"])]\n",
    "s2_ct             "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "4    a\n",
       "5    b\n",
       "dtype: category\n",
       "Categories (2, object): ['a', 'b']"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_ct.cat.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5- How to create a dummy variable?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    a\n",
       "1    b\n",
       "2    c\n",
       "3    d\n",
       "4    a\n",
       "5    b\n",
       "6    c\n",
       "7    d\n",
       "dtype: category\n",
       "Categories (4, object): ['a', 'b', 'c', 'd']"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s_ct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c  d\n",
       "0  1  0  0  0\n",
       "1  0  1  0  0\n",
       "2  0  0  1  0\n",
       "3  0  0  0  1\n",
       "4  1  0  0  0\n",
       "5  0  1  0  0\n",
       "6  0  0  1  0\n",
       "7  0  0  0  1"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.get_dummies(s_ct)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Don't forget to follow us on [YouTube](http://youtube.com/tirendazacademy) | [Medium](http://tirendazacademy.medium.com) | [Twitter](http://twitter.com/tirendazacademy) | [GitHub](http://github.com/tirendazacademy) | [Linkedin](https://www.linkedin.com/in/tirendaz-academy) | [Kaggle](https://www.kaggle.com/tirendazacademy) 😎"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
